import pandas as pd
import numpy as np
from multiprocessing import Pool
import re
from pyhanlp import HanLP # If cannot install pyhanlp, install jieba instead, then substitute the jieba.lcut() function for HanLP.segment().
import time

# Load the lexicon CSV and keep only its 'word' column as a plain Python list
# (used below as the default separator list of splitChWords2EmoGrps). The
# DataFrame itself is not kept around.
# Alternative lexicon: 'Final Datasets/DLUT_reshaped_final_Synonym_Expanded.csv'
emoWordList = pd.read_csv(
    'Final Datasets/DLUT_reshaped_final.csv', low_memory=False
)['word'].values.tolist()

# df = pd.read_csv('Final Datasets/df7_ch_for emo split.csv', low_memory=False)
# df = df.head(1000)

def splitChText2Sentences(text):
    """Split a text into non-empty, whitespace-stripped fragments.

    The text is cut at every run of the separator characters listed in the
    pattern below: full-width and ASCII semicolon, comma and full stop, plus
    the ASCII space.

    Args:
        text: The text to split. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        list[str]: The fragments in their original order, with empty and
        whitespace-only fragments removed.
    """
    if not isinstance(text, str):
        # A single missing cell must not abort a whole batch with the
        # TypeError that re.split raises for non-string input.
        if text is None or (isinstance(text, float) and text != text):
            return []
        text = str(text)

    # Define the pattern for separators
    pattern = r'[； .;,，。]+'

    # Split, strip each fragment, and drop the ones that end up empty
    fragments = (piece.strip() for piece in re.split(pattern, text))
    return [piece for piece in fragments if piece]

def splitChSentence2Words(sentence):
    """Segment a sentence with HanLP and return its non-blank tokens.

    Args:
        sentence: The text to segment; it is converted with ``str()`` before
            being passed to ``HanLP.segment``.

    Returns:
        list: The ``word`` attribute of each segmented term, stripped of
        surrounding whitespace, with tokens that are empty after stripping
        left out.
    """
    tokens = []
    for term in HanLP.segment(str(sentence)):
        cleaned = term.word.strip()
        if cleaned:
            tokens.append(cleaned)
    return tokens

# Words that may directly follow a separator word without closing its group.
_EMO_CONJUNCTIONS = frozenset(
    ['和', '及', '以及', '或', '或者', '不', '得', '着', '的', '地', '掉']
)
# Extra words that always count as separators, in addition to sepList.
_EMO_SUPPLEMENTS = frozenset(['不了', '极了', '下去'])
# Single-entry memo for _emoSeparatorSet: (source object, its length, set).
_emoSeparatorMemo = None


def _emoSeparatorSet(sepList):
    """Return sepList plus the supplement words as a set for O(1) lookups."""
    global _emoSeparatorMemo
    memo = _emoSeparatorMemo
    # Reuse the previous set when the very same collection is passed again
    # (the usual case: the module-level default) and its size is unchanged.
    if memo is not None and memo[0] is sepList and memo[1] == len(sepList):
        return memo[2]
    separators = set(sepList) | _EMO_SUPPLEMENTS
    _emoSeparatorMemo = (sepList, len(sepList), separators)
    return separators


def splitChWords2EmoGrps(wordList, sepList = emoWordList):
    """Cut a word list into groups that each end at a separator word.

    Words are accumulated into the current group. When a separator word is
    reached, the group is closed unless the next word is itself a separator
    or one of the conjunction words, in which case the group keeps growing.
    Words left over after the last closed group are returned as a final
    group only if that remainder contains a separator; otherwise they are
    dropped.

    Args:
        wordList: Sequence of word strings (supports ``len`` and indexing).
        sepList: Collection of separator words. It is never modified; the
            supplement words are added to an internal copy. The derived
            lookup set is memoised per collection object and rebuilt when
            the collection's length changes.

    Returns:
        list[list[str]] | None: The groups in order, or ``None`` when
        ``wordList`` contains no separator word at all.
    """
    # Previously the supplements were appended to sepList itself, which grew
    # the shared default list by three entries on every single call.
    separators = _emoSeparatorSet(sepList)

    groups = []
    current_group = []
    found_separator = False       # any separator seen in the whole list
    group_has_separator = False   # separator seen since the last closed group
    last_idx = len(wordList) - 1

    for idx, word in enumerate(wordList):
        current_group.append(word)

        if word not in separators:
            continue

        found_separator = True
        group_has_separator = True

        if idx != last_idx:
            next_word = wordList[idx + 1]
            if next_word in separators or next_word in _EMO_CONJUNCTIONS:
                # The next word extends this group; keep accumulating.
                continue

        groups.append(current_group)
        current_group = []
        group_has_separator = False

    # Keep an unfinished trailing group only if it holds a separator.
    if current_group and group_has_separator:
        groups.append(current_group)

    return groups if found_separator else None

def splitChText2EmoGrps(text):
    """Split a text into word groups, fragment by fragment.

    The text is first cut with ``splitChText2Sentences``; every fragment is
    segmented with ``splitChSentence2Words`` and grouped with
    ``splitChWords2EmoGrps``.

    Args:
        text: The text to process.

    Returns:
        list: All groups from all fragments, in order. Fragments for which
        ``splitChWords2EmoGrps`` returns nothing contribute no groups.
    """
    collected = []
    for fragment in splitChText2Sentences(text):
        fragment_groups = splitChWords2EmoGrps(splitChSentence2Words(fragment))
        if not fragment_groups:
            continue
        collected += fragment_groups
    return collected


def process_chunk(chunk):
    """Apply ``splitChText2EmoGrps`` to every element of one chunk.

    This is the worker function that ``main`` hands to ``parallel_apply``.

    Args:
        chunk: An object with an ``apply`` method; ``parallel_apply`` passes
            pieces of a DataFrame column.

    Returns:
        Whatever ``chunk.apply`` returns for ``splitChText2EmoGrps``.
    """
    split_result = chunk.apply(splitChText2EmoGrps)
    return split_result

def parallel_apply(df, column_name, func, num_processes=None):
    """Process one DataFrame column in parallel chunks and rejoin the results.

    The column is cut into ``num_processes`` contiguous chunks, ``func`` is
    run on each chunk in a ``multiprocessing.Pool``, and the per-chunk
    results are concatenated in chunk order. The elapsed wall time of the
    pool run is printed.

    Args:
        df: The DataFrame holding the column.
        column_name: Name of the column to process.
        func: Picklable callable taking one chunk (a slice of the column) and
            returning a pandas object that ``pd.concat`` can join.
        num_processes: Number of worker processes and of chunks. ``None``
            (the default) uses the CPU count, capped at the number of rows
            and never less than one.

    Returns:
        The concatenation of all per-chunk results.
    """
    import os  # local import: only needed to resolve the default worker count

    print('processing')

    column = df[column_name]

    if num_processes is None:
        # The declared default used to crash in np.array_split; pick a
        # sensible worker count instead.
        num_processes = max(1, min(os.cpu_count() or 1, len(column)))

    # Split row positions rather than the Series itself, so the chunking does
    # not depend on how NumPy handles pandas objects.
    position_blocks = np.array_split(np.arange(len(column)), num_processes)
    chunks = [column.iloc[block] for block in position_blocks]

    start = time.perf_counter()

    # Create a multiprocessing pool and process chunks in parallel
    with Pool(processes=num_processes) as pool:
        results = pool.map(func, chunks)

    # total runtime
    runTime = time.perf_counter() - start
    print(str(runTime) + ' seconds')

    # Combine the results
    return pd.concat(results)

def main():
    """Process segment files 1 to 29 and write one output CSV per segment.

    For each segment the input CSV is read, the configured text columns are
    split in parallel with ``process_chunk`` (12 worker processes), the
    results are stored in new columns, and the DataFrame is saved without
    its index.
    """
    # (source column, result column) pairs handled for every segment.
    # ('threeEmotionsCH', 'threeEmotionsCH_split') can be added here as well.
    column_pairs = (
        ('ch', 'ch_split'),
        ('themeCH', 'themeCH_split'),
    )

    for segment in range(1, 30):
        df = pd.read_csv(f'Final Datasets/7.csv segments/df7_ch_for emo split {segment}.csv', low_memory=False)

        print(f'Current segment: {segment}')

        # Apply function in parallel, one column at a time
        for source_column, result_column in column_pairs:
            df[result_column] = parallel_apply(df, source_column, process_chunk, num_processes=12)

        # Save the DataFrame to a CSV file
        # (synonym-expanded runs were written to
        #  'df7_ch_synonym_expanded_after emo split {segment}.csv' instead)
        df.to_csv(f'Final Datasets/7.csv segments/df7_ch_after emo split {segment}.csv', index=False)
        print()

# Run main() only when this file is executed as a script. multiprocessing may
# re-import this module inside worker processes (e.g. with the "spawn" start
# method); without this guard every worker would try to start the job again.
if __name__ == '__main__':
    main()

